ggplot2::diamonds Data Case

DataSet

ggplot2::diamonds

Basic Statistical Transformations: Stat

# Summarise depth within each cut: the median, together with the range from
# the minimum to the maximum.
# `fun`, `fun.min` and `fun.max` replace `fun.y`, `fun.ymin` and `fun.ymax`,
# which were deprecated in ggplot2 3.3.0.
ggplot(data = diamonds) +
  stat_summary(
    mapping = aes(x = cut, y = depth),
    fun.min = min,
    fun.max = max,
    fun = median
  )

# position = "identity" draws the clarity bars on top of each other instead of
# stacking them, so the tallest bar is lower than in the default stacked chart.
# Outlines only (fill = NA) keep the overlapping bars visible.
# Alternative with semi-transparent fills instead of outlines:
#   geom_bar(mapping = aes(fill = clarity), alpha = 2/5, position = "identity")
ggplot(data = diamonds) +
  geom_bar(
    mapping = aes(x = cut, color = clarity),
    fill = NA,
    position = "identity"
  )

# Shared base plot: cut on the x axis; each layer below splits bars by clarity.
base_plot <- ggplot(data = diamonds, mapping = aes(x = cut))

# position = "fill": makes proportions easy to compare across cuts.
base_plot +
  geom_bar(aes(fill = clarity), position = "fill")

# position = "dodge": separates overlapping objects.
base_plot +
  geom_bar(aes(fill = clarity), position = "dodge")
# Overplotting demo on the diamonds whose depth equals 60.
# Layers tried along the way:
#   geom_point(mapping = aes(x = depth, y = price))
#     -> many points around a depth of 60 overlap
#   geom_point(data = filter(diamonds, depth == 60), mapping = aes(x = depth, y = price))
#     -> still hard to see where the mass of the data is
#   geom_point(data = filter(diamonds, depth == 60), mapping = aes(x = depth, y = price), position = "jitter")
#     -> spreads the 511 points out by adding random noise to each point
# width = 0.1 makes the total horizontal spread 0.2 around 60.
ggplot(data = diamonds) +
  geom_jitter(
    data = filter(diamonds, depth == 60),
    mapping = aes(x = depth, y = price),
    width = 0.1
  )

# Number of rows kept by the filter.
cat("Filtered Result Rows:", nrow(filter(diamonds, depth == 60)))
## Filtered Result Rows: 511
# The same rows counted per cut.
count(filter(diamonds, depth == 60), cut)

cut: Examine the distribution of a categorical variable

A variable is categorical if it can only take one of a small set of values (Wickham 2010).

(
  ggplot(data = diamonds) +
    # after_stat(count) refers to the count computed by the stat; it replaces
    # the `..count..` notation, which was deprecated in ggplot2 3.4.0.
    geom_bar(mapping = aes(x = cut, y = after_stat(count)))
    # Equivalent forms:
    #   geom_bar(mapping = aes(x = cut))    # count is computed implicitly
    #   stat_count(mapping = aes(x = cut))  # count transformation under the hood
    # Error: binwidth is not supported in a bar chart:
    #   geom_bar(mapping = aes(x = cut), binwidth = 30)
)

(
  ggplot(data = diamonds) +
    # Proportions are computed group-wise. Without an explicit `group`, ggplot2
    # groups by the discrete variables mapped in the plot (here cut and color),
    # so prop is not the share of the whole dataset. This plot is a test of
    # whether the color variable takes part in the group definition.
    # The same happens without fill:
    #   geom_bar(mapping = aes(x = cut, y = after_stat(prop)))
    # after_stat(prop) replaces the `..prop..` notation (deprecated in
    # ggplot2 3.4.0).
    geom_bar(mapping = aes(x = cut, y = after_stat(prop), fill = color))
)

(
  ggplot(data = diamonds) +
    # group = 1 disables the default grouping strategy, so ggplot2 treats the
    # whole dataset as one group when computing prop.
    # after_stat(prop) replaces the `..prop..` notation (deprecated in
    # ggplot2 3.4.0).
    geom_bar(mapping = aes(x = cut, y = after_stat(prop), group = 1))
)

(
  ggplot(data = diamonds) +
    # Counts per cut, split by clarity.
    # after_stat(count) replaces the `..count..` notation (deprecated in
    # ggplot2 3.4.0).
    geom_bar(mapping = aes(x = cut, y = after_stat(count), fill = clarity))
    # "stack" is the default position of geom_bar(), so this is the same as:
    #   geom_bar(mapping = aes(x = cut, y = after_stat(count), fill = clarity), position = "stack")
)

(
  ggplot(data = diamonds) +
    # Demonstration of a combination that does not work: group = 1 puts every
    # row into a single group, so there are no per-clarity groups left for
    # position = "stack" to stack in this proportion plot.
    # NOTE(review): the fill = clarity mapping is presumably dropped for the
    # same reason; confirm against the rendered plot.
    # after_stat(prop) replaces the `..prop..` notation (deprecated in
    # ggplot2 3.4.0).
    geom_bar(
      mapping = aes(x = cut, y = after_stat(prop), fill = clarity, group = 1),
      position = "stack"
    )
)

# Color a bar chart: `color` is mapped to cut.
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar(aes(color = cut))

# Fill a bar chart: `fill` is mapped to cut.
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar(aes(fill = cut))

# Small pre-aggregated table: one row per cut with its frequency.
# For the identity stat each key (cut) should appear only once.
demo_small_table <- tibble::tribble(
  ~cut, ~freq,
  "Fair", 1000,
  "Good", 2000,
  "Very Good", 3000,
  "Premium", 4000,
  "Ideal", 5000
)
demo_small_table

# stat = "identity" plots freq as given instead of counting rows.
ggplot(data = demo_small_table, mapping = aes(x = cut, y = freq)) +
  geom_bar(stat = "identity")

price: Examine the distribution of a continuous variable

Although a bar chart can be drawn for a continuous variable, geom_bar does not support stat_bin, so geom_histogram and geom_freqpoly are much more commonly used for continuous variables; both make it easy to explore a variety of binwidths.

# Histogram of price in bins 500 wide, with each bar split by cut.
ggplot(data = diamonds, mapping = aes(x = price, fill = cut)) +
  geom_histogram(binwidth = 500)

# Tabular form of the same binning for the continuous variable.
diamonds %>%
  count(cut_width(price, width = 500))

In the graph above, the tallest bar shows that almost 10,000 observations have a price value between 750 and 1750 (the range is much easier to see in the tabular summary), where

  • 750 is called the left edge
  • 1750 is called the right edge

of that bar.

cut~price: Covariation between a categorical variable and continuous variable

# Price distribution per cut as stacked histogram bars (30 bins).
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(aes(fill = cut), bins = 30)

# The same counts drawn as one frequency polygon per cut.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_freqpoly(aes(color = cut), bins = 30)

Instead of displaying the count, plotting the density is another approach; it ensures that the area under each frequency polygon is one.

# Frequency polygons of price per cut on the density scale instead of counts.
# after_stat(density) replaces the `..density..` notation, which was
# deprecated in ggplot2 3.4.0.
ggplot(diamonds) +
  geom_freqpoly(
    mapping = aes(x = price, y = after_stat(density), color = cut),
    bins = 30
  )

cut~color: Covariation between Two Categorical Variables

# Number of observations for every cut/color combination.
ggplot(diamonds) +
  geom_count(mapping = aes(x = cut, y = color))

## The same counts are also available in table form:
diamonds %>% count(color, cut)

# Heat map of the counts, with the fill scale and its legend both reversed.
diamonds %>%
  count(color, cut) %>%
  ggplot() +
  geom_tile(mapping = aes(x = cut, y = color, fill = n)) +
  scale_fill_continuous(trans = "reverse") +
  # TRUE instead of T: T is an ordinary variable that can be reassigned.
  guides(fill = guide_legend(reverse = TRUE))

carat~price: Covariation between Two Continuous Variables

Scatterplots become less useful as the size of the dataset grows, because of overplotting among points. Even with transparency, very large datasets remain challenging.

# Base scatterplot of price against carat; alpha = 1/10 reduces overplotting.
price_vs_carat <- ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point(alpha = 1/10)
price_vs_carat

# Log10 scales on both axes with breaks and labels written as powers of ten,
# plus a linear-model smoother.
price_vs_carat +
  scale_x_log10(
    breaks = trans_breaks("log10", function(x)  10 ^ x),
    labels = trans_format("log10", math_format(10 ^ .x))
  ) +
  scale_y_log10(
    breaks = trans_breaks("log10", function(x)  10 ^ x),
    labels = trans_format("log10", math_format(10 ^ .x))
  ) +
  geom_smooth(method = "lm")

# Log10 applied through the coordinate system instead of the scales.
price_vs_carat +
  coord_trans(x = "log10", y = "log10")

# Log10 scales and a linear-model smoother, drawn in polar coordinates.
price_vs_carat +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm") +
  coord_polar()

# Bin carat and price in two dimensions: rectangular bins.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_bin2d()

# Bin carat and price in two dimensions: hexagonal bins.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_hex()

# Bin carat only: one boxplot of price per carat interval of width 0.5.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_width(carat, 0.5)))

# Bin carat only: one boxplot of price per group made by cut_number(carat, 20).
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_number(carat, 20)))

# Fit log(price) against log(carat) once; both residual plots below reuse this
# fit instead of refitting an identical model.
model <- lm(log(price) ~ log(carat), data = diamonds)

# The residuals are on the log scale, so exp() turns them into the ratio of
# the observed price to the predicted price.
# Residuals against carat.
diamonds %>%
  add_residuals(model = model) %>%
  mutate(resid = exp(resid)) %>%
  ggplot() +
  geom_point(mapping = aes(x = carat, y = resid))

# Residuals per cut.
diamonds %>%
  add_residuals(model = model) %>%
  mutate(resid = exp(resid)) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = cut, y = resid))

Residual = Observed y-value - Predicted y-value

Coordinate Systems

Coordinate Transformation

A regular bar chart converted into polar coordinates produces another type of graphic: the Coxcomb plot (Wickham 2010).

# Bar chart of cut, built in one chain:
# - show.legend = FALSE because the x axis already identifies each bar
# - width = 1 on the bars
# - aspect.ratio = 1 sets the aspect ratio of the panel
# - both axis titles are removed
bar <- ggplot(data = diamonds) +
  geom_bar(aes(x = cut, fill = cut), show.legend = FALSE, width = 1) +
  theme(aspect.ratio = 1) +
  labs(x = NULL, y = NULL)
bar

# The same bars with x and y flipped.
bar + coord_flip()

# The same bars in polar coordinates: the Coxcomb plot.
bar + coord_polar()

# One bar (constant x = "") filled by clarity.
bar_plot <- ggplot(data = diamonds, mapping = aes(x = "", fill = clarity)) +
  geom_bar(width = 1)
bar_plot

# Polar coordinates with the angle mapped to y.
bar_plot + coord_polar(theta = "y")

# Polar coordinates with the angle mapped to x.
bar_plot + coord_polar(theta = "x")

Zooming

# Histogram of the y variable in bins 0.5 wide, before any zooming.
beforeZoom <- ggplot(data = diamonds, mapping = aes(x = y)) +
  geom_histogram(binwidth = 0.5)
beforeZoom

# Zoom in on counts between 0 and 30.
beforeZoom +
  coord_cartesian(ylim = c(0, 30))

# Zoom in on y values between -2 and 4 and counts between 0 and 10.
beforeZoom +
  coord_cartesian(xlim = c(-2, 4), ylim = c(0, 10))

Based on the graph above, filter the records by the variable y and inspect them in a detailed table:

# Rows whose y is below 3 or above 20, ordered by y, showing only price and
# the three dimension columns.
diamonds %>%
  filter(y < 3 | y > 20) %>%
  arrange(y) %>%
  select(price, x, y, z)

For values that are unusual, replacing them with missing values is the recommended practice; the graph created afterwards is then less affected by outliers.

# Replace y values below 3 or above 20 with NA to remove the effect of the
# unusual values, then redraw the histogram of y.
diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y)) %>%
  ggplot(mapping = aes(x = y)) +
  geom_histogram(binwidth = 0.5)
## Warning: Removed 9 rows containing non-finite values (stat_bin).

References

Wickham, Hadley. 2010. “A Layered Grammar of Graphics.” Journal of Computational and Graphical Statistics. https://www.researchgate.net/publication/228842388_A_Layered_Grammar_of_Graphics.